In [ ]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os
import h3
import itertools
from tqdm import tqdm
from multiprocessing import Pool
from scipy.stats import norm
from collections import defaultdict
from sklearn.cluster import AgglomerativeClustering
from mirrorverse.warehouse.utils import get_engine
from mirrorverse.chinook.states import spatial_key_to_index
pd.options.mode.chained_assignment = None
os.environ["DATABASE_URL"] = "sqlite:////workspaces/mirrorverse/mirrorverse.db"
Load the Data¶
In [ ]:
sql = '''
select
tag_key,
date_key,
depth,
epoch
from
tag_depths
'''
depth = pd.read_sql(sql, get_engine())
print(depth.shape)
depth.head()
2024-05-14 13:29:25,205 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-14 13:29:25,206 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("
select
tag_key,
date_key,
depth,
epoch
from
tag_depths
")
2024-05-14 13:29:25,206 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-14 13:29:25,209 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("
select
tag_key,
date_key,
depth,
epoch
from
tag_depths
")
2024-05-14 13:29:25,210 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-14 13:29:25,211 INFO sqlalchemy.engine.Engine
select
tag_key,
date_key,
depth,
epoch
from
tag_depths
2024-05-14 13:29:25,213 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-14 13:29:28,119 INFO sqlalchemy.engine.Engine ROLLBACK
(1180821, 4)
Out[ ]:
| tag_key | date_key | depth | epoch | |
|---|---|---|---|---|
| 0 | 129843 | 1387411200 | 83.4 | 1387411200 |
| 1 | 129843 | 1387411200 | 80.7 | 1387412100 |
| 2 | 129843 | 1387411200 | 91.5 | 1387413000 |
| 3 | 129843 | 1387411200 | 91.5 | 1387413900 |
| 4 | 129843 | 1387411200 | 88.8 | 1387414800 |
Build or Load the Depth Classes¶
In [ ]:
def divide_among_classes(depth, depth_classes, min_value=0.01):
sd = depth * 0.08 / 1.96 # ~two standard deviations gives our 95% confidence interval
if sd == 0:
division = np.zeros(len(depth_classes))
division[0] == 1
else:
# we're going to assume the depth classes are sorted
z = (depth_classes - depth) / sd
division = norm.cdf(z)
division[1:] = division[1:] - division[:-1]
division[division < min_value] = 0
return division
min_size = 0.01
depth_classes = np.array([25, 50, 75, 100, 150, 200, 250, 300, 400, 500])
depth_classes_hash = '_'.join([str(e) for e in depth_classes])
file_prefix = f'depth_classes_{depth_classes_hash}'
if not os.path.exists(f'depth/{file_prefix}.csv'):
print('Adding Depth Classes...')
max_rows = 1000000
i = 0
rows = []
rows_scanned = 0
print('Total Rows to Scan:', depth.shape[0], '...')
for _, row in tqdm(depth.iterrows()):
rows_scanned += 1
membership = divide_among_classes(row['depth'], depth_classes, min_size)
for depth_class, size in zip(depth_classes, membership):
if size >= min_size:
new_row = {k:v for k,v in row.items()}
new_row['probability'] = size
new_row['depth_class'] = depth_class
rows.append(new_row)
if len(rows) >= max_rows:
pd.DataFrame(rows).to_csv(f'{file_prefix}_{i}.csv', index=False)
rows = []
i += 1
if len(rows):
pd.DataFrame(rows).to_csv(f'{file_prefix}_{i}.csv', index=False)
i += 1
print('Joining DataFrames...')
dfs = []
for j in range(i):
df = pd.read_csv(f'{file_prefix}_{j}.csv')
df['tag_key'] = df['tag_key'].astype(str)
dfs.append(df)
pd.concat(dfs).to_csv(f'depth/{file_prefix}.csv', index=False)
for j in range(i):
os.remove(f'{file_prefix}_{j}.csv')
depth = pd.read_csv(f'depth/{file_prefix}.csv')
depth['tag_key'] = depth['tag_key'].astype(str)
/tmp/ipykernel_3247/3790572311.py:56: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
depth = pd.read_csv(f'depth/{file_prefix}.csv')
Adding Context (Features)¶
In [ ]:
sql = '''
select
tt.*,
h.home_region,
e.elevation
from
tag_tracks tt
left join home_regions h
on tt.tag_key = h.tag_key
left join elevation e
on tt.h3_level_4_key = e.h3_level_4_key
'''
tt = pd.read_sql_query(
sql,
get_engine()
)
print(tt.head())
tt.head()
In [ ]:
depth = depth.merge(tt[['tag_key', 'date_key', 'longitude', 'latitude', 'home_region', 'elevation']])
depth.head()
In [ ]:
from suntimes import SunTimes
def get_sunrise(lat, lon, date):
return SunTimes(longitude=lon, latitude=lat, altitude=0).risewhere(date, 'UTC').hour
def get_sunset(lat, lon, date):
return SunTimes(longitude=lon, latitude=lat, altitude=0).setwhere(date, 'UTC').hour
depth['datetime'] = pd.to_datetime(depth['epoch'], utc=True, unit='s')
depth['date'] = depth['datetime'].dt.date
depth = depth[np.abs(depth['longitude']) <= 180]
depth['sunrise'] = depth.apply(
lambda r: get_sunrise(r['latitude'], r['longitude'], r['date']), axis=1
)
depth['sunset'] = depth.apply(
lambda r: get_sunset(r['latitude'], r['longitude'], r['date']), axis=1
)
depth['hour'] = depth['datetime'].dt.hour
depth['daytime'] = (depth['hour'] < depth['sunset']) | (depth['hour'] > depth['sunrise'])
depth.head()
In [ ]:
depth.to_csv(f'depth/{file_prefix}_context.csv', index=False)
In [ ]:
depth = pd.read_csv(f'depth/{file_prefix}_context.csv')
/tmp/ipykernel_3247/1546720778.py:1: DtypeWarning: Columns (0,8) have mixed types. Specify dtype option on import or set low_memory=False.
depth = pd.read_csv(f'depth/{file_prefix}_context.csv')
Function for Creating Vectors¶
In [ ]:
def build_vectors(depth, depth_classes, group_columns, window_size):
group_columns = list(group_columns) + ['window']
depth['window'] = depth['date_key'] - depth['date_key'] % (window_size)
df = depth.groupby(group_columns + ['depth_class']).agg({'probability': 'sum'}).reset_index()
df['sum_probability'] = df.groupby(group_columns)[['probability']].transform('sum')
df['overall_probability'] = df['probability'] / df['sum_probability']
vectors = defaultdict(lambda: np.zeros(len(depth_classes)))
indices = {
v: i
for i, v in enumerate(depth_classes)
}
print(df.shape[0])
for _, row in tqdm(df.iterrows()):
key = tuple([row[key] for key in group_columns])
vectors[key][indices[row['depth_class']]] = row['overall_probability']
features = []
directions = []
for feature, direction in vectors.items():
features.append(feature)
directions.append(direction)
return features, directions, depth.groupby(group_columns)[['epoch']].nunique().reset_index()
group_columns = ['tag_key', 'daytime']
window_size = 24 * 3600
features, vectors, counts = build_vectors(
depth, depth_classes, group_columns, window_size
)
47632
47632it [00:01, 40700.02it/s]
Function for Building Distances¶
In [ ]:
def build_distance_matrix(vectors):
vectors = [v / np.linalg.norm(v) for v in vectors]
distances = np.zeros((len(vectors), len(vectors)))
for i, v1 in tqdm(enumerate(vectors)):
for j, v2 in enumerate(vectors):
distance = 1 / (max(np.dot(v1, v2), 10 ** -6)) - 1
distances[i, j] = distance
return distances
print(len(vectors))
distances = build_distance_matrix(vectors)
13374
13374it [03:16, 68.14it/s]
The Clustering Itself¶
In [ ]:
N = 12
clustering = AgglomerativeClustering(n_clusters=N, metric="precomputed", connectivity=None, linkage="average")
clustering.fit(distances)
max_distance = 0
max_distances = {}
for label in range(N):
distance = distances[clustering.labels_ == label,:][:,clustering.labels_ == label].max()
max_distances[label] = distance
max_distance = max(max_distance, distance)
print(max_distance)
print(pd.Series(clustering.labels_).value_counts() / len(clustering.labels_))
max_distances
7.8574058313038595 9 0.516525 0 0.136982 2 0.129131 5 0.082773 4 0.076417 1 0.024002 3 0.020936 8 0.006729 11 0.002542 6 0.001869 7 0.001495 10 0.000598 Name: count, dtype: float64
Out[ ]:
{0: 7.8574058313038595,
1: 1.3485093761734581,
2: 2.4139092413219165,
3: 3.0454771796167295,
4: 2.5301855767672485,
5: 1.5620269473917259,
6: 0.2799164592527059,
7: 0.7085194189886073,
8: 1.3337028235103614,
9: 2.1271473408426465,
10: 0.40261818435928065,
11: 0.3643891176301597}
Add Class Labels to Depth Data¶
In [ ]:
rows = []
for feature_array, label in zip(features, clustering.labels_):
row = {}
for group, feature in zip(group_columns + ['window'], feature_array):
row[group] = feature
row['cluster'] = label
rows.append(row)
clusters_df = pd.DataFrame(rows)
depth['window'] = depth['date_key'] - depth['date_key'] % window_size
clusters_df = depth.merge(clusters_df)
clusters_df['datetime'] = pd.to_datetime(clusters_df['epoch'], utc=True, unit='s')
clusters_df['tag_key'] = clusters_df['tag_key'].astype(str)
clusters_df['cluster'] = clusters_df['cluster'].astype(str)
print(clusters_df.shape)
clusters_df.head()
(1290863, 18)
Out[ ]:
| tag_key | date_key | depth | epoch | probability | depth_class | longitude | latitude | home_region | elevation | datetime | date | sunrise | sunset | hour | daytime | window | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 129843 | 1387411200 | 83.4 | 1387411200 | 0.993199 | 100 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:00:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 |
| 1 | 129843 | 1387411200 | 80.7 | 1387412100 | 0.041772 | 75 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:15:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 |
| 2 | 129843 | 1387411200 | 80.7 | 1387412100 | 0.958228 | 100 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:15:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 |
| 3 | 129843 | 1387411200 | 91.5 | 1387413000 | 0.988571 | 100 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:30:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 |
| 4 | 129843 | 1387411200 | 91.5 | 1387413000 | 0.011424 | 150 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:30:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 |
View It!¶
In [ ]:
tag_key = '129843'
px.scatter(
clusters_df[clusters_df['tag_key'] == tag_key], x='datetime', y='depth', color='cluster'
)
In [ ]:
vector_sums = defaultdict(lambda: np.zeros(len(vectors[0])))
sizes = defaultdict(int)
for label, vector in zip(clustering.labels_, vectors):
vector_sums[label] += vector
sizes[label] += 1
mean_vectors = {
label: vector_sums[label] / sizes[label]
for label in clustering.labels_
}
rows = []
for label, vector in mean_vectors.items():
for depth_class, prob in zip(depth_classes, vector):
rows.append({
'cluster': str(label),
'depth_class': depth_class,
'probability': prob
})
groups_df = pd.DataFrame(rows)
groups_df.head()
Out[ ]:
| cluster | depth_class | probability | |
|---|---|---|---|
| 0 | 5 | 25 | 0.044002 |
| 1 | 5 | 50 | 0.045928 |
| 2 | 5 | 75 | 0.225012 |
| 3 | 5 | 100 | 0.549442 |
| 4 | 5 | 150 | 0.129686 |
In [ ]:
px.bar(
groups_df, x='depth_class', y='probability', color='cluster', facet_row='cluster', width=800, height=1200
)
Modeling¶
In [ ]:
clusters_df['month'] = clusters_df['datetime'].dt.month
clusters_df.head()
Out[ ]:
| tag_key | date_key | depth | epoch | probability | depth_class | longitude | latitude | home_region | elevation | datetime | date | sunrise | sunset | hour | daytime | window | cluster | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 129843 | 1387411200 | 83.4 | 1387411200 | 0.993199 | 100 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:00:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 | 12 |
| 1 | 129843 | 1387411200 | 80.7 | 1387412100 | 0.041772 | 75 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:15:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 | 12 |
| 2 | 129843 | 1387411200 | 80.7 | 1387412100 | 0.958228 | 100 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:15:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 | 12 |
| 3 | 129843 | 1387411200 | 91.5 | 1387413000 | 0.988571 | 100 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:30:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 | 12 |
| 4 | 129843 | 1387411200 | 91.5 | 1387413000 | 0.011424 | 150 | -166.922615 | 54.13176 | NaN | -184.870688 | 2013-12-19 00:30:00+00:00 | 2013-12-19 | 19 | 2 | 0 | True | 1387411200 | 5 | 12 |
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
tag_keys = clusters_df['tag_key'].unique()
train_keys = np.random.choice(tag_keys, int(0.8 * len(tag_keys)), replace=False)
test_keys = np.array(list(set(tag_keys) - set(train_keys)))
folds = 4
rows = []
np.random.shuffle(train_keys)
for i, tag_key in enumerate(train_keys):
rows.append({
'tag_key': tag_key,
'fold': i % folds
})
tag_folds = pd.DataFrame(rows)
tag_folds.head()
Out[ ]:
| tag_key | fold | |
|---|---|---|
| 0 | 202597 | 0 |
| 1 | 159006 | 1 |
| 2 | 205411 | 2 |
| 3 | 229206 | 3 |
| 4 | 172906 | 0 |
In [ ]:
df = (clusters_df['cluster'].value_counts() / clusters_df.shape[0]).reset_index()
allowable_clusters = set(df[df['count'] >= 0.05]['cluster'])
allowable_clusters
Out[ ]:
{'0', '2', '4', '5', '9'}
In [ ]:
from imblearn.over_sampling import RandomOverSampler
df = clusters_df.drop_duplicates(['tag_key', 'epoch'])
features = ['daytime', 'month', 'elevation', 'sunset', 'sunrise']
X_train = df[df['tag_key'].isin(train_keys) & df['cluster'].isin(allowable_clusters)][features + ['tag_key', 'cluster']].copy()
y_train = df[df['tag_key'].isin(train_keys) & df['cluster'].isin(allowable_clusters)]['cluster'].copy().astype(int)
print('Before Resampling:', X_train.shape)
old_shape = X_train.shape
X_train, y_train = RandomOverSampler().fit_resample(X_train, y_train)
print('After Resampling:', X_train.shape)
X_train = X_train.sample(old_shape[0])
print('Downsampling:', X_train.shape)
X_train = X_train.merge(tag_folds)
keys_train_array = X_train['tag_key']
fold = np.array(X_train['fold'])
y_train = X_train['cluster'].astype(int).reset_index(drop=True)
X_train = X_train[features].reset_index(drop=True)
X_test = df[df['tag_key'].isin(test_keys) & df['cluster'].isin(allowable_clusters)][features].copy()
y_test = df[df['tag_key'].isin(test_keys) & df['cluster'].isin(allowable_clusters)]['cluster'].copy().astype(int)
split = PredefinedSplit(fold)
model = GridSearchCV(
estimator=RandomForestClassifier(
bootstrap=False, n_jobs=(os.cpu_count() - 2)
),
param_grid={"n_estimators": [20], "min_weight_fraction_leaf": [0.01, 0.05, 0.1]},#"min_samples_leaf": [10, 50, 100, 500, 1000, 5000]},
return_train_score=True,
cv=split,
refit=True,
)
#StratifiedKFold(n_splits=5, shuffle=True, random_state=42),#KFold(n_splits=5, shuffle=True, random_state=42),
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
print('Evaluating...')
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
X_test_rebalanced, y_test_rebalanced = RandomOverSampler().fit_resample(X_test, y_test)
y_pred_test_rebalanced = model.predict(X_test_rebalanced)
print(
round(accuracy_score(y_train, y_pred_train),3),
round(accuracy_score(y_test, y_pred_test), 3),
round(accuracy_score(y_test_rebalanced, y_pred_test_rebalanced), 3)
)
model.best_estimator_
Before Resampling: (764990, 7) After Resampling: (2314610, 7) Downsampling: (764990, 7) Evaluating... 0.461 0.367 0.324
Out[ ]:
RandomForestClassifier(bootstrap=False, min_weight_fraction_leaf=0.01,
n_estimators=20, n_jobs=6)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(bootstrap=False, min_weight_fraction_leaf=0.01,
n_estimators=20, n_jobs=6)In [ ]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_rebalanced, y_pred_test_rebalanced)
(cm.T / cm.T.sum(axis=0)).T
Out[ ]:
array([[0.43260292, 0.17494706, 0.11424087, 0.19156541, 0.08664374],
[0.12190799, 0.21133022, 0.22839914, 0.17887449, 0.25948816],
[0.18460418, 0.27417659, 0.24567082, 0.23804875, 0.05749966],
[0.22035655, 0.29741826, 0.13560518, 0.3165748 , 0.03004521],
[0.12981542, 0.19273688, 0.1862938 , 0.07851854, 0.41263536]])
In [ ]:
cm = confusion_matrix(y_train, y_pred_train)
(cm.T / cm.T.sum(axis=0)).T
Out[ ]:
array([[0.56606852, 0.06999674, 0.08926591, 0.16788254, 0.1067863 ],
[0.15751252, 0.32812938, 0.11422654, 0.13007939, 0.27005217],
[0.18707267, 0.13257072, 0.35099807, 0.19103871, 0.13831984],
[0.21707628, 0.09799928, 0.11027775, 0.46248898, 0.11215771],
[0.15480477, 0.12474502, 0.0652355 , 0.05695152, 0.59826319]])
In [ ]:
cm = confusion_matrix(y_test, y_pred_test)
(cm.T / cm.T.sum(axis=0)).T
Out[ ]:
array([[0.43161726, 0.17659906, 0.11565263, 0.18939158, 0.08673947],
[0.12113256, 0.21199972, 0.22683083, 0.18042152, 0.25961538],
[0.1849387 , 0.27483946, 0.244892 , 0.23741973, 0.0579101 ],
[0.22111925, 0.29806795, 0.13570953, 0.31472352, 0.03037975],
[0.12981542, 0.19273688, 0.1862938 , 0.07851854, 0.41263536]])
In [ ]:
y_train.value_counts() / y_train.shape[0]
Out[ ]:
cluster 2 0.200705 0 0.200329 5 0.200258 4 0.199408 9 0.199301 Name: count, dtype: float64
In [ ]:
pd.Series(y_pred_train).value_counts() / y_pred_train.shape[0]
Out[ ]:
0 0.256641 9 0.244870 5 0.201801 2 0.150802 4 0.145886 Name: count, dtype: float64
In [ ]:
y_test.value_counts() / y_test.shape[0]
Out[ ]:
cluster 9 0.626015 2 0.132493 0 0.090401 4 0.080528 5 0.070562 Name: count, dtype: float64
In [ ]:
pd.Series(y_pred_test).value_counts() / y_pred_test.shape[0]
Out[ ]:
9 0.307362 2 0.207874 4 0.186428 0 0.166830 5 0.131506 Name: count, dtype: float64
In [ ]:
pd.Series(y_pred_test_rebalanced).value_counts() / y_pred_test_rebalanced.shape[0]
Out[ ]:
2 0.230122 0 0.217857 5 0.200716 4 0.182042 9 0.169262 Name: count, dtype: float64
In [ ]:
print(features)
model.best_estimator_.feature_importances_
['daytime', 'month', 'elevation', 'sunset', 'sunrise']
Out[ ]:
array([0.06200596, 0.23282532, 0.35349933, 0.12604526, 0.22562413])
In [ ]: